library(stringr)
library(ggplot2)
library(dplyr)
library(GGally)
library(reshape2)
library(fastDummies)
library(caret)
library(Matrix)
library(glmnet)
library(car)
library(rpart)
CODE 1
# Load the dataset
data <- read.csv("./Employee Turnover.csv")
# print 5 sample rows
sample_n(data, 5)
Code 2
# print data structure
str(data)
'data.frame': 1470 obs. of 32 variables:
$ Age : int 41 49 37 33 27 32 59 30 38 36 ...
$ BusinessTravel : chr "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
$ Department : chr "Sales" "Research & Development" "Research & Development" "Research & Development" ...
$ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
$ Education : int 2 1 2 4 1 2 3 1 3 3 ...
$ EducationField : chr "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
$ EmployeeNumber : int 1 2 4 5 7 8 10 11 12 13 ...
$ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
$ Gender : chr "Female" "Male" "Male" "Female" ...
$ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
$ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
$ JobRole : chr "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
$ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
$ MaritalStatus : chr "Single" "Married" "Single" "Married" ...
$ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
$ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
$ OverTime : chr "Yes" "No" "Yes" "Yes" ...
$ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
$ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
$ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
$ Status : chr "Terminated" "Active" "Terminated" "Active" ...
$ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
$ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
$ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
$ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
$ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
$ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
$ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
$ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
$ TurnoverType : chr "Voluntary" "" "Voluntary" "" ...
$ TurnoverReason : chr "Resignation" "" "Resignation" "" ...
$ Location : chr "Dallas" "Zurich" "Zurich" "Tokyo" ...
Code 3
# Check if all EmployeeNumber values are unique
cat('EmployeeNumber values are unique:', (length(unique(data$EmployeeNumber)) == nrow(data)))
EmployeeNumber values are unique: TRUE
data <- subset(data, select = -EmployeeNumber)
Code 4
# Calculate the number of missing values per column
missing_values <- sapply(data, function(x) sum(is.na(x)))
# Filter columns with missing values greater than 0
missing_values <- missing_values[missing_values > 0]
# Check if there are any missing values and print the result
if (length(missing_values) == 0) {
print("There are no explicit missing values")
} else {
print(missing_values)
}
[1] "There are no explicit missing values"
Code 5
# Identify character columns
char_columns <- sapply(data, is.character)
char_columns_names <- names(data)[char_columns]
# Print value counts for each character column that has an empty string ""
for (col_name in char_columns_names) {
if ("" %in% data[[col_name]]) {
cat(paste("Value counts for ", col_name, ":", sep = ""))
print(table(data[[col_name]]))
cat("\n")
}
}
Value counts for TurnoverType:
Involuntary Voluntary
1233 14 223
Value counts for TurnoverReason:
Layoff Resignation Retirement
1233 14 214 9
# Create logical vectors for empty string checks
empty_turnover_type <- data$TurnoverType == ""
empty_turnover_reason <- data$TurnoverReason == ""
# Check if every time one is empty, the other is also empty
if (all(empty_turnover_type == empty_turnover_reason)) {
print("Empty strings in TurnoverType and TurnoverReason match on the same rows every time.")
# Replace empty strings with "StillEmployed" in TurnoverType
data$TurnoverType <- ifelse(data$TurnoverType == "", "StillEmployed", data$TurnoverType)
# Replace empty strings with "StillEmployed" in TurnoverReason
data$TurnoverReason <- ifelse(data$TurnoverReason == "", "StillEmployed", data$TurnoverReason)
} else {
print("Empty strings in TurnoverType and TurnoverReason do not match on the same rows every time.")
}
[1] "Empty strings in TurnoverType and TurnoverReason match on the same rows every time."
Code 6
# Define the lists of columns
numerical_cols <- c("Age", "MonthlyIncome", "PercentSalaryHike","DistanceFromHome",
"TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
"YearsSinceLastPromotion", "YearsWithCurrManager",
"TrainingTimesLastYear", "NumCompaniesWorked"
)
ordinal_cols <- c( "Education", "StockOptionLevel", "BusinessTravel",
"JobLevel", "PerformanceRating", "JobInvolvement","JobSatisfaction",
"RelationshipSatisfaction", "EnvironmentSatisfaction", "WorkLifeBalance")
nominal_cols <- c("Gender", "MaritalStatus", "EducationField", "Department", "JobRole",
"OverTime", "Location", "Status", "TurnoverType", "TurnoverReason")
categorical_cols <- c("Gender", "MaritalStatus", "Education", "EducationField", "StockOptionLevel",
"BusinessTravel","Department", "JobRole", "OverTime",
"JobLevel", "PerformanceRating", "JobInvolvement","JobSatisfaction",
"RelationshipSatisfaction", "EnvironmentSatisfaction", "WorkLifeBalance",
"Location", "Status", "TurnoverType", "TurnoverReason")
# Transform ordinal variables into ordered factors
data$BusinessTravel <- factor(data$BusinessTravel, levels = c("Non-Travel", "Travel_Rarely", "Travel_Frequently"), ordered = TRUE)
data$Education <- factor(data$Education, levels = 1:5,ordered = TRUE)
data$EnvironmentSatisfaction <- factor(data$EnvironmentSatisfaction, levels = 1:4, ordered = TRUE)
data$JobInvolvement <- factor(data$JobInvolvement,levels = 1:4, ordered = TRUE)
data$JobLevel <- factor(data$JobLevel, levels = 1:5, ordered = TRUE)
data$JobSatisfaction <- factor(data$JobSatisfaction, levels = 1:4, ordered = TRUE)
data$PerformanceRating <- factor(data$PerformanceRating, levels = 1:4, ordered = TRUE)
data$RelationshipSatisfaction <- factor(data$RelationshipSatisfaction, levels = 1:4, ordered = TRUE)
data$StockOptionLevel <- factor(data$StockOptionLevel, levels = 0:3, ordered = TRUE)
data$WorkLifeBalance <- factor(data$WorkLifeBalance, levels = 1:4, ordered = TRUE)
# Transform nominal variables into non-ordered factors
data$Department <- factor(data$Department)
data$EducationField <- factor(data$EducationField)
data$Gender <- factor(data$Gender)
data$JobRole <- factor(data$JobRole)
data$MaritalStatus <- factor(data$MaritalStatus)
data$OverTime <- factor(data$OverTime)
data$Location <- factor(data$Location)
data$Status <- factor(data$Status)
data$TurnoverType <- factor(data$TurnoverType)
data$TurnoverReason <- factor(data$TurnoverReason)
# New order of columns
new_column_order <- c("Age", "Gender", "MaritalStatus",
"EducationField", "Education",
"Location", "DistanceFromHome",
"MonthlyIncome", "PercentSalaryHike", "StockOptionLevel",
"Department", "JobRole", "JobLevel",
"WorkLifeBalance", "BusinessTravel", "OverTime", "PerformanceRating", "TrainingTimesLastYear", "JobInvolvement",
"TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager", "NumCompaniesWorked",
"RelationshipSatisfaction", "EnvironmentSatisfaction", "JobSatisfaction",
"Status", "TurnoverType", "TurnoverReason")
# Reorder the columns in the dataframe
data <- data[, new_column_order]
Code 7
# Summary statistics for numerical variables
print('Numerical Values')
[1] "Numerical Values"
numerical_summary <- summary(data[, numerical_cols])
print(numerical_summary)
Age MonthlyIncome PercentSalaryHike DistanceFromHome TotalWorkingYears YearsAtCompany YearsInCurrentRole
Min. :18.00 Min. : 1009 Min. :11.00 Min. : 1.000 Min. : 0.00 Min. : 0.000 Min. : 0.000
1st Qu.:30.00 1st Qu.: 2911 1st Qu.:12.00 1st Qu.: 2.000 1st Qu.: 6.00 1st Qu.: 3.000 1st Qu.: 2.000
Median :36.00 Median : 4919 Median :14.00 Median : 7.000 Median :10.00 Median : 5.000 Median : 3.000
Mean :36.92 Mean : 6503 Mean :15.21 Mean : 9.193 Mean :11.28 Mean : 7.008 Mean : 4.229
3rd Qu.:43.00 3rd Qu.: 8379 3rd Qu.:18.00 3rd Qu.:14.000 3rd Qu.:15.00 3rd Qu.: 9.000 3rd Qu.: 7.000
Max. :60.00 Max. :19999 Max. :25.00 Max. :29.000 Max. :40.00 Max. :40.000 Max. :18.000
YearsSinceLastPromotion YearsWithCurrManager TrainingTimesLastYear NumCompaniesWorked
Min. : 0.000 Min. : 0.000 Min. :0.000 Min. :0.000
1st Qu.: 0.000 1st Qu.: 2.000 1st Qu.:2.000 1st Qu.:1.000
Median : 1.000 Median : 3.000 Median :3.000 Median :2.000
Mean : 2.188 Mean : 4.123 Mean :2.799 Mean :2.693
3rd Qu.: 3.000 3rd Qu.: 7.000 3rd Qu.:3.000 3rd Qu.:4.000
Max. :15.000 Max. :17.000 Max. :6.000 Max. :9.000
# Summary statistics for ordinal variables
print('Ordinal Values')
[1] "Ordinal Values"
ordinal_data <- data[, ordinal_cols]
ordinal_summary <- sapply(ordinal_data, function(x) summary(as.numeric(as.factor(x))))
print(ordinal_summary)
Education StockOptionLevel BusinessTravel JobLevel PerformanceRating JobInvolvement JobSatisfaction
Min. 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
1st Qu. 2.000000 1.000000 2.000000 1.000000 3.000000 2.000000 2.000000
Median 3.000000 2.000000 2.000000 2.000000 3.000000 3.000000 3.000000
Mean 2.912925 1.793878 2.086395 2.063946 3.091156 2.729932 2.728571
3rd Qu. 4.000000 2.000000 2.000000 3.000000 3.000000 3.000000 4.000000
Max. 5.000000 4.000000 3.000000 5.000000 4.000000 4.000000 4.000000
RelationshipSatisfaction EnvironmentSatisfaction WorkLifeBalance
Min. 1.000000 1.000000 1.000000
1st Qu. 2.000000 2.000000 2.000000
Median 3.000000 3.000000 3.000000
Mean 2.712245 2.721769 2.761224
3rd Qu. 4.000000 4.000000 3.000000
Max. 4.000000 4.000000 4.000000
# Summarize nominal variables
print('Nominal Values')
[1] "Nominal Values"
nominal_summary <- summary(data[, nominal_cols])
print(nominal_summary)
Gender MaritalStatus EducationField Department JobRole OverTime
Female:588 Divorced:327 Human Resources : 27 Human Resources : 63 Sales Executive :326 No :1054
Male :882 Married :673 Life Sciences :606 Research & Development:961 Research Scientist :292 Yes: 416
Single :470 Marketing :159 Sales :446 Laboratory Technician :259
Medical :464 Manufacturing Director :145
Other : 82 Healthcare Representative:131
Technical Degree:132 Manager :102
(Other) :215
Location Status TurnoverType TurnoverReason
Dallas:366 Active :1233 Involuntary : 14 Layoff : 14
Tokyo :387 Terminated: 237 StillEmployed:1233 Resignation : 214
Zurich:717 Voluntary : 223 Retirement : 9
StillEmployed:1233
Code 8
# Combined histograms with KDE and boxplots for numerical variables
for (col in numerical_cols) {
# Set up the plotting area to have 1 row and 2 columns
par(mfrow=c(1, 2))
# First plot: Histogram with KDE
if (col == "TrainingTimesLastYear") {
hist(data[[col]], main="Histogram", xlab="", breaks=6, freq=FALSE)
dens <- density(data[[col]], bw = 3 * bw.nrd0(data[[col]]), na.rm = TRUE)
} else if (col == "YearsAtCompany") {
hist(data[[col]], main="Histogram", xlab="", breaks=40, freq=FALSE)
dens <- density(data[[col]], bw = 3 * bw.nrd0(data[[col]]), na.rm = TRUE)
} else if (col == "YearsInCUrrentRole") {
hist(data[[col]], main="Histogram", xlab="", breaks=18, freq=FALSE)
dens <- density(data[[col]], bw = 3 * bw.nrd0(data[[col]]), na.rm = TRUE)
} else if (col == "DistanceFromHome") {
hist(data[[col]], main="Histogram", xlab="", breaks=15, freq=FALSE)
dens <- density(data[[col]], bw = 3 * bw.nrd0(data[[col]]), na.rm = TRUE)
} else if (col == "TotalWorkingYears") {
hist(data[[col]], main="Histogram", xlab="", breaks=40, freq=FALSE)
dens <- density(data[[col]], bw = 3 * bw.nrd0(data[[col]]), na.rm = TRUE)
} else {
hist(data[[col]], main="Histogram", xlab="", freq=FALSE)
dens <- density(data[[col]], na.rm = TRUE)
}
lines(dens, col="blue")
# Second plot: Boxplot
boxplot(data[[col]], main="Boxplot", las=2)
# Add a general title for the set of plots with the column name
title(paste("Distribution of", col), outer=TRUE, line=-1, cex.main=1.5)
}
# Bar plots for categorical variables
for (col in categorical_cols) {
barplot(table(data[[col]]), main=paste("Bar Plot of", col), las=2)
}
Code 9
# Convert Ordinal Variables to Numeric and Combine with Numerical Variables
numeric_and_ordinal <- cbind(data[, numerical_cols], sapply(data[, ordinal_cols], as.numeric))
# Calculate the correlation matrix
correlation_matrix <- cor(numeric_and_ordinal, use = "complete.obs")
# Melt the correlation matrix into a long format
cor_melted <- melt(correlation_matrix)
# Filter out the lower triangle and diagonal
cor_melted <- cor_melted[upper.tri(correlation_matrix, diag = FALSE), ]
# Create a heatmap with values
ggplot(cor_melted, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = sprintf("%.2f", value)), size = 1.5) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Pearson\nCorrelation") +
theme_minimal() +
theme(axis.title = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5),
axis.text.y = element_text(hjust = 1)) +
scale_x_discrete(position = "top")
Code 10
# Create OHE dataset
ohe_data <- data[, numerical_cols]
ohe_data <- cbind(ohe_data, sapply(data[, ordinal_cols], as.numeric))
ohe_data <- cbind(ohe_data, data[, nominal_cols])
ohe_data <- dummy_cols(ohe_data, remove_first_dummy = FALSE, remove_selected_columns = TRUE)
# Calculate the correlation matrix
correlation_matrix <- cor(ohe_data)
# Melt the correlation matrix into a long format
cor_melted <- melt(correlation_matrix)
# Filter out the lower triangle and diagonal
cor_melted <- cor_melted[upper.tri(correlation_matrix, diag = FALSE), ]
# ggplot code for the heatmap
ggplot(cor_melted, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = sprintf("%.2f", value)), size = 1.5) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Pearson\nCorrelation") +
theme_minimal() +
theme(axis.title = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5), # Set properties for x-axis text here
axis.text.y = element_text(hjust = 1)) +
scale_x_discrete(position = "top")
Code 11
# Create a grouped bar plot
ggplot(data, aes(x = StockOptionLevel, fill = MaritalStatus)) +
geom_bar(position = position_dodge()) +
labs(title = "Relationship between StockOptionLevel and MaritalStatus",
x = "StockOptionLevel",
y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Code 12
# Create a grouped bar plot
ggplot(data, aes(x = JobRole, fill = Department)) +
geom_bar(position = position_dodge()) +
labs(title = "Relationship between JobRole, and Department",
x = "JobRole",
y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 0))
Code 13
# Rename Departments
data <- data %>%
mutate(Department = factor(case_when(
Department == "Human Resources" ~ "HR",
Department == "Research & Development" ~ "R&D",
TRUE ~ as.character(Department) # Keeps all other values as they are
)))
# Merge department and job role
data$JobRole <- paste(data$Department, "-", data$JobRole)
# Drop Department
data <- subset(data, select = -Department)
Code 14
# Create a grouped bar plot
ggplot(data, aes(x = JobRole, fill = EducationField)) +
geom_bar(position = position_dodge()) +
labs(title = "Relationship between JobRole, and EducationField",
x = "JobRole",
y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 0))
Code 15
# Create a grouped bar plot
ggplot(data, aes(x = JobLevel, fill = JobLevel)) +
geom_bar(position = position_dodge()) +
facet_wrap(~ JobRole) +
labs(title = "Relationship between JobRole, and JobLevel",
x = "JobLevel",
y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 0))
Code 16
# Create a grouped bar plot
ggplot(data, aes(x = YearsAtCompany, fill = JobLevel)) +
geom_bar(position = position_dodge()) +
# facet_wrap(~ JobRole) +
labs(title = "Relationship between JobRole, YearsAtCompany and JobLevel",
x = "YearsAtCompany",
y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 0))
Code 17
# Create a grouped bar plot
ggplot(data, aes(x = YearsAtCompany, fill = JobLevel)) +
geom_bar(position = position_dodge()) +
facet_wrap(~ JobRole) +
labs(title = "Relationship between JobRole, YearsAtCompany and JobLevel",
x = "YearsAtCompany",
y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 0))
Code 18
# Create a grouped bar plot
ggplot(data, aes(x = TurnoverReason, fill = TurnoverType)) +
geom_bar(position = position_dodge()) +
facet_wrap(~ Status) +
labs(title = "Relationship between TurnoverReason, TurnoverType, and Status",
x = "Turnover Reason",
y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Code 19
# Drop Turnover and Status
data <- subset(data, select = -c(TurnoverType, Status))
# Remove unused column from nominal columns and ohe dataset
nominal_cols <- setdiff(nominal_cols, c("Department", "TurnoverType", "Status"))
ohe_data <- data[, numerical_cols]
ohe_data <- cbind(ohe_data, sapply(data[, ordinal_cols], as.numeric))
ohe_data <- cbind(ohe_data, data[nominal_cols])
ohe_data <- dummy_cols(ohe_data, remove_first_dummy = FALSE, remove_selected_columns = TRUE)
Code 20
# Calculate the correlation matrix
cor_matrix <- cor(ohe_data)
# Selecting the last 4 rows and dropping the last 4 columns
selected_rows = tail(cor_matrix, 4)
result = selected_rows[, -((ncol(selected_rows)-3):ncol(selected_rows))]
# Melting the data into a long format suitable for ggplot
layoff_correlations = melt(result)
# Plotting the heatmap
ggplot(layoff_correlations, aes(Var2, Var1, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = sprintf("%.2f", value)), size = 1.5) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Pearson\nCorrelation") +
theme_minimal() +
theme(axis.title = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5), # Properties for x-axis text
axis.text.y = element_text(hjust = 1)) +
scale_x_discrete(position = "top")
Code 21
# Iterating over each row
for (i in 1:nrow(result)) {
# Extract the current row
current_row = result[i, ]
row_name = rownames(result)[i]
# Finding correlations over 0.15
correlations_over_015 = current_row[current_row > 0.15]
if (length(correlations_over_015) > 0) {
cat(row_name, "positive correlations over 0.15:\n")
print(correlations_over_015)
}
cat("\n")
# Finding correlations under -0.15
correlations_under_015 = current_row[current_row < -0.15]
if (length(correlations_under_015) > 0) {
cat(row_name, "negative correlations under -0.15:\n")
print(correlations_under_015)
}
cat("\n\n")
}
TurnoverReason_Layoff negative correlations under -0.15:
PerformanceRating
-0.1662448
TurnoverReason_Resignation positive correlations over 0.15:
MaritalStatus_Single OverTime_Yes
0.1760980 0.2373928
TurnoverReason_Resignation negative correlations under -0.15:
Age MonthlyIncome TotalWorkingYears YearsAtCompany YearsInCurrentRole YearsWithCurrManager JobLevel OverTime_No
-0.1889709 -0.1701031 -0.1961135 -0.1652653 -0.1731110 -0.1613356 -0.1755028 -0.2373928
TurnoverReason_Retirement positive correlations over 0.15:
YearsAtCompany
0.177886
TurnoverReason_StillEmployed positive correlations over 0.15:
Age MonthlyIncome TotalWorkingYears YearsInCurrentRole YearsWithCurrManager JobLevel OverTime_No
0.1592050 0.1598396 0.1710632 0.1605450 0.1561993 0.1691048 0.2461180
TurnoverReason_StillEmployed negative correlations under -0.15:
MaritalStatus_Single JobRole_Sales - Sales Representative OverTime_Yes
-0.1754186 -0.1572343 -0.2461180
Code 22
# Calculating the median age
median_age <- median(data$Age, na.rm = TRUE)
# Creating the new column 'NewEmployeeGroup'
data$NewEmployeeGroup <- with(data,
Age <= median_age &
YearsInCurrentRole <= 10 &
JobLevel <= 2 &
OverTime == "Yes" &
MaritalStatus == "Single")
# Creating a new dataset with only 'TurnoverReason' and 'NewEmployeeGroup'
group_df <- data[, c("TurnoverReason", "NewEmployeeGroup")]
data <- subset(data, select = -NewEmployeeGroup)
group_df <- subset(group_df, TurnoverReason %in% c("Resignation", "StillEmployed"))
group_df$TurnoverReason <- ifelse(group_df$TurnoverReason == "Resignation", TRUE, FALSE)
# Renaming the column
names(group_df)[names(group_df) == "TurnoverReason"] <- "Resignation"
# Converting the table to a dataframe for plotting
cont_table <- table(Resignation = group_df$Resignation, NewEmployeeGroup = group_df$NewEmployeeGroup)
cont_table_with_margins <- addmargins(cont_table)
print(cont_table_with_margins)
NewEmployeeGroup
Resignation FALSE TRUE Sum
FALSE 1207 26 1233
TRUE 172 42 214
Sum 1379 68 1447
# Performing the Chi-Squared test
chi_squared_test <- chisq.test(cont_table)
# Printing the results
print(chi_squared_test)
Pearson's Chi-squared test with Yates' continuity correction
data: cont_table
X-squared = 121.06, df = 1, p-value < 2.2e-16
Code 23
# Filter the dataset
logReg_data <- subset(data, TurnoverReason %in% c("Resignation", "StillEmployed"))
# Rebuild the factor with just the remaining levels
logReg_data$TurnoverReason <- factor(logReg_data$TurnoverReason)
# Map 'TurnoverReason' to bolean and rename the column
logReg_data$TurnoverReason <- ifelse(logReg_data$TurnoverReason == "Resignation", TRUE, FALSE)
names(logReg_data)[names(logReg_data) == "TurnoverReason"] <- "Resignation"
# Perform logistic regression
model <- glm(Resignation ~ ., data = logReg_data, family = "binomial")
# Print the summary of the model
cat("Logistic Regression for\n")
Logistic Regression for
print(summary(model))
Call:
glm(formula = Resignation ~ ., family = "binomial", data = logReg_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.283e+00 1.535e+00 1.487 0.13698
Age -3.658e-02 1.598e-02 -2.289 0.02208 *
GenderMale 6.230e-01 2.211e-01 2.817 0.00484 **
MaritalStatusMarried 3.397e-01 3.277e-01 1.037 0.29990
MaritalStatusSingle 6.905e-01 4.614e-01 1.497 0.13446
EducationFieldLife Sciences -7.674e-01 9.872e-01 -0.777 0.43700
EducationFieldMarketing -2.520e-01 1.038e+00 -0.243 0.80811
EducationFieldMedical -7.739e-01 9.838e-01 -0.787 0.43148
EducationFieldOther -8.394e-01 1.050e+00 -0.799 0.42408
EducationFieldTechnical Degree 3.973e-01 9.999e-01 0.397 0.69115
Education.L 3.145e-02 4.805e-01 0.065 0.94781
Education.Q -2.997e-01 4.129e-01 -0.726 0.46785
Education.C 3.217e-02 3.063e-01 0.105 0.91636
Education^4 2.960e-02 2.182e-01 0.136 0.89212
LocationTokyo -7.232e-01 2.877e-01 -2.514 0.01193 *
LocationZurich -5.240e-01 2.497e-01 -2.098 0.03590 *
DistanceFromHome 6.774e-02 1.297e-02 5.221 1.78e-07 ***
MonthlyIncome -1.451e-04 1.083e-04 -1.340 0.18019
PercentSalaryHike -4.057e-02 4.636e-02 -0.875 0.38154
StockOptionLevel.L -4.913e-01 3.927e-01 -1.251 0.21087
StockOptionLevel.Q 1.048e+00 3.536e-01 2.964 0.00304 **
StockOptionLevel.C 3.643e-02 3.286e-01 0.111 0.91175
JobRoleHR - Manager -1.459e+01 9.949e+02 -0.015 0.98830
JobRoleR&D - Healthcare Representative -4.652e-01 8.409e-01 -0.553 0.58009
JobRoleR&D - Laboratory Technician 1.896e-01 6.388e-01 0.297 0.76661
JobRoleR&D - Manager -2.696e-01 1.341e+00 -0.201 0.84063
JobRoleR&D - Manufacturing Director -3.521e-01 8.467e-01 -0.416 0.67755
JobRoleR&D - Research Director -2.599e+00 1.617e+00 -1.607 0.10799
JobRoleR&D - Research Scientist -8.246e-01 6.474e-01 -1.274 0.20274
JobRoleSales - Manager -1.462e+01 5.348e+02 -0.027 0.97819
JobRoleSales - Sales Executive 8.551e-01 7.513e-01 1.138 0.25502
JobRoleSales - Sales Representative 7.824e-01 6.962e-01 1.124 0.26107
JobLevel.L 2.800e+00 1.461e+00 1.917 0.05518 .
JobLevel.Q 1.465e+00 7.126e-01 2.055 0.03985 *
JobLevel.C -1.587e-01 6.085e-01 -0.261 0.79421
JobLevel^4 1.129e+00 4.517e-01 2.498 0.01248 *
WorkLifeBalance.L -7.493e-01 3.344e-01 -2.241 0.02504 *
WorkLifeBalance.Q 8.053e-01 2.746e-01 2.933 0.00336 **
WorkLifeBalance.C 2.352e-01 2.017e-01 1.167 0.24341
BusinessTravel.L 1.626e+00 3.482e-01 4.670 3.01e-06 ***
BusinessTravel.Q 2.822e-03 2.218e-01 0.013 0.98985
OverTimeYes 2.329e+00 2.381e-01 9.779 < 2e-16 ***
PerformanceRating.L -8.936e-01 7.896e-01 -1.132 0.25780
PerformanceRating.Q -7.568e-02 5.988e-01 -0.126 0.89944
PerformanceRating.C 1.132e+00 3.739e-01 3.027 0.00247 **
TrainingTimesLastYear -1.798e-01 8.421e-02 -2.135 0.03275 *
JobInvolvement.L -1.620e+00 3.772e-01 -4.295 1.75e-05 ***
JobInvolvement.Q 2.428e-01 3.052e-01 0.795 0.42644
JobInvolvement.C -2.516e-01 2.044e-01 -1.231 0.21842
TotalWorkingYears -8.981e-02 3.571e-02 -2.515 0.01191 *
YearsAtCompany -3.333e-03 5.448e-02 -0.061 0.95123
YearsInCurrentRole -1.530e-01 6.138e-02 -2.492 0.01270 *
YearsSinceLastPromotion 2.416e-01 5.295e-02 4.562 5.06e-06 ***
YearsWithCurrManager -7.108e-02 5.981e-02 -1.188 0.23470
NumCompaniesWorked 2.018e-01 4.750e-02 4.249 2.14e-05 ***
RelationshipSatisfaction.L -8.495e-01 2.151e-01 -3.949 7.86e-05 ***
RelationshipSatisfaction.Q 4.327e-01 2.195e-01 1.972 0.04865 *
RelationshipSatisfaction.C -2.378e-01 2.173e-01 -1.094 0.27399
EnvironmentSatisfaction.L -1.085e+00 2.186e-01 -4.962 6.96e-07 ***
EnvironmentSatisfaction.Q 3.503e-01 2.126e-01 1.647 0.09947 .
EnvironmentSatisfaction.C -3.004e-01 2.163e-01 -1.389 0.16478
JobSatisfaction.L -1.038e+00 2.140e-01 -4.850 1.24e-06 ***
JobSatisfaction.Q 1.516e-02 2.131e-01 0.071 0.94331
JobSatisfaction.C -4.156e-01 2.155e-01 -1.929 0.05377 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1212.69 on 1446 degrees of freedom
Residual deviance: 651.16 on 1383 degrees of freedom
AIC: 779.16
Number of Fisher Scoring iterations: 16
Code 24
# Select columns
lr_data <- subset(data, select = -TurnoverReason)
# Build the linear regression model
model <- lm(MonthlyIncome ~.,
data = lr_data)
# Print the model summary
print(summary(model))
Call:
lm(formula = MonthlyIncome ~ ., data = lr_data)
Residuals:
Min 1Q Median 3Q Max
-3127.0 -658.5 -59.6 617.4 4554.1
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7817.7442 366.0873 21.355 < 2e-16 ***
Age -2.5968 4.3857 -0.592 0.553874
GenderMale 79.0595 57.2833 1.380 0.167760
MaritalStatusMarried 63.1492 76.1518 0.829 0.407102
MaritalStatusSingle 101.5424 123.3403 0.823 0.410494
EducationFieldLife Sciences -129.3146 277.9932 -0.465 0.641880
EducationFieldMarketing -76.2878 296.0163 -0.258 0.796665
EducationFieldMedical -134.2951 278.8174 -0.482 0.630123
EducationFieldOther -150.7106 298.8355 -0.504 0.614111
EducationFieldTechnical Degree -74.1742 290.0110 -0.256 0.798171
Education.L -116.6871 116.0287 -1.006 0.314745
Education.Q -9.1632 99.6937 -0.092 0.926780
Education.C -185.2105 76.9916 -2.406 0.016275 *
Education^4 -22.4112 56.3892 -0.397 0.691104
LocationTokyo 11.7510 78.6569 0.149 0.881263
LocationZurich -24.8824 68.8438 -0.361 0.717830
DistanceFromHome -2.6457 3.4674 -0.763 0.445585
PercentSalaryHike 14.6971 12.1597 1.209 0.226991
StockOptionLevel.L -85.1296 103.5528 -0.822 0.411166
StockOptionLevel.Q -123.4194 87.4324 -1.412 0.158289
StockOptionLevel.C 90.1082 72.8371 1.237 0.216249
JobRoleHR - Manager 3992.4068 398.0443 10.030 < 2e-16 ***
JobRoleR&D - Healthcare Representative 860.7167 219.1190 3.928 8.98e-05 ***
JobRoleR&D - Laboratory Technician -392.0763 197.6170 -1.984 0.047448 *
JobRoleR&D - Manager 4320.4732 281.2122 15.364 < 2e-16 ***
JobRoleR&D - Manufacturing Director 778.7766 216.7329 3.593 0.000338 ***
JobRoleR&D - Research Director 4257.1670 256.3557 16.606 < 2e-16 ***
JobRoleR&D - Research Scientist -316.2068 196.8697 -1.606 0.108460
JobRoleSales - Manager 4095.3726 297.8399 13.750 < 2e-16 ***
JobRoleSales - Sales Executive 844.1697 210.4127 4.012 6.34e-05 ***
JobRoleSales - Sales Representative -634.8012 224.4518 -2.828 0.004747 **
JobLevel.L 9132.6124 215.1821 42.441 < 2e-16 ***
JobLevel.Q 600.0114 110.0116 5.454 5.81e-08 ***
JobLevel.C -833.6343 91.3683 -9.124 < 2e-16 ***
JobLevel^4 -12.9399 80.8157 -0.160 0.872812
WorkLifeBalance.L 22.9439 100.8067 0.228 0.819988
WorkLifeBalance.Q -55.0219 81.9521 -0.671 0.502082
WorkLifeBalance.C 24.8944 56.6545 0.439 0.660435
BusinessTravel.L 89.2420 77.0618 1.158 0.247037
BusinessTravel.Q -53.7801 52.2010 -1.030 0.303070
OverTimeYes 73.4496 62.6495 1.172 0.241240
PerformanceRating.L 13.0635 218.1599 0.060 0.952259
PerformanceRating.Q 6.6977 176.6925 0.038 0.969768
PerformanceRating.C -215.0435 115.1838 -1.867 0.062115 .
TrainingTimesLastYear -6.1246 21.8677 -0.280 0.779462
JobInvolvement.L -220.8724 100.4437 -2.199 0.028043 *
JobInvolvement.Q 129.5784 81.4796 1.590 0.111988
JobInvolvement.C 10.2440 55.3516 0.185 0.853199
TotalWorkingYears 36.0097 8.0485 4.474 8.29e-06 ***
YearsAtCompany -3.8039 9.9142 -0.384 0.701273
YearsInCurrentRole 20.1063 12.9512 1.552 0.120775
YearsSinceLastPromotion 8.1168 11.3090 0.718 0.473042
YearsWithCurrManager -8.8432 13.2031 -0.670 0.503106
NumCompaniesWorked 28.6874 12.5976 2.277 0.022924 *
RelationshipSatisfaction.L 20.5645 58.2751 0.353 0.724227
RelationshipSatisfaction.Q -53.4220 57.9769 -0.921 0.356980
RelationshipSatisfaction.C 18.1810 56.2584 0.323 0.746614
EnvironmentSatisfaction.L -26.0416 57.8699 -0.450 0.652778
EnvironmentSatisfaction.Q 12.7865 57.6156 0.222 0.824402
EnvironmentSatisfaction.C -13.9406 57.3070 -0.243 0.807837
JobSatisfaction.L -26.7299 56.9745 -0.469 0.639031
JobSatisfaction.Q 62.1505 57.5375 1.080 0.280250
JobSatisfaction.C 0.4462 57.7783 0.008 0.993840
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1055 on 1407 degrees of freedom
Multiple R-squared: 0.9519, Adjusted R-squared: 0.9497
F-statistic: 448.7 on 62 and 1407 DF, p-value: < 2.2e-16
# Generate predictions
predictions <- predict(model, lr_data)
# Create a plot
ggplot(lr_data, aes(x = MonthlyIncome, y = predictions)) +
geom_point(alpha = 0.7) +
geom_abline(intercept = 0, slope = 1) +
coord_fixed(ratio = 1, xlim = c(0, 22000), ylim = c(0, 22000)) +
xlab("Actual Monthly Income") +
ylab("Predicted Monthly Income") +
ggtitle("Actual vs Predicted Monthly Income")
Code 25
# Build the linear regression model
model <- lm(MonthlyIncome ~ + JobLevel + JobRole + TotalWorkingYears, data = lr_data)
# Print the model summary
print(summary(model))
Call:
lm(formula = MonthlyIncome ~ +JobLevel + JobRole + TotalWorkingYears,
data = lr_data)
Residuals:
Min 1Q Median 3Q Max
-3152.2 -659.4 -77.2 623.5 4374.4
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8119.432 199.053 40.790 < 2e-16 ***
JobLevel.L 9106.926 209.804 43.407 < 2e-16 ***
JobLevel.Q 601.046 106.500 5.644 2.00e-08 ***
JobLevel.C -813.930 88.193 -9.229 < 2e-16 ***
JobLevel^4 19.949 79.228 0.252 0.801241
JobRoleHR - Manager 3979.016 390.638 10.186 < 2e-16 ***
JobRoleR&D - Healthcare Representative 846.435 185.769 4.556 5.64e-06 ***
JobRoleR&D - Laboratory Technician -417.540 161.318 -2.588 0.009741 **
JobRoleR&D - Manager 4264.847 253.643 16.814 < 2e-16 ***
JobRoleR&D - Manufacturing Director 745.895 183.565 4.063 5.10e-05 ***
JobRoleR&D - Research Director 4217.642 227.798 18.515 < 2e-16 ***
JobRoleR&D - Research Scientist -343.477 160.190 -2.144 0.032184 *
JobRoleSales - Manager 4080.315 271.288 15.041 < 2e-16 ***
JobRoleSales - Sales Executive 841.188 171.156 4.915 9.90e-07 ***
JobRoleSales - Sales Representative -639.271 189.104 -3.381 0.000743 ***
TotalWorkingYears 40.304 6.031 6.683 3.32e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1055 on 1454 degrees of freedom
Multiple R-squared: 0.9503, Adjusted R-squared: 0.9497
F-statistic: 1852 on 15 and 1454 DF, p-value: < 2.2e-16
# Generate predictions
predictions <- predict(model, lr_data)
# Create a plot
ggplot(lr_data, aes(x = MonthlyIncome, y = predictions, color = JobRole)) +
geom_point(alpha = 0.7) +
geom_abline(intercept = 0, slope = 1) +
coord_fixed(ratio = 1, xlim = c(0, 22000), ylim = c(0, 22000)) +
xlab("Actual Monthly Income") +
ylab("Predicted Monthly Income") +
ggtitle("Actual vs Predicted Monthly Income")
# Create a plot
ggplot(lr_data, aes(x = MonthlyIncome, y = predictions, color = JobLevel)) +
geom_point(alpha = 0.7) +
geom_abline(intercept = 0, slope = 1) +
coord_fixed(ratio = 1, xlim = c(0, 22000), ylim = c(0, 22000)) +
xlab("Actual Monthly Income") +
ylab("Predicted Monthly Income") +
ggtitle("Actual vs Predicted Monthly Income")
Code 26
# Perform ANOVA
aov <- aov(PercentSalaryHike ~ JobRole, data=data)
# print the summary
summary(aov)
Df Sum Sq Mean Sq F value Pr(>F)
JobRole 10 133 13.31 0.994 0.447
Residuals 1459 19544 13.40
# Create a new dataframe with observed and predicted values
plot_data <- data.frame(Observed = data$PercentSalaryHike, Predicted = aov$fitted.values, JobRole = data$JobRole)
# Create the scatterplot
ggplot(plot_data, aes(x = Observed, y = Predicted, color = JobRole)) +
geom_point(alpha = 0.7) +
geom_abline(intercept = 0, slope = 1) +
coord_fixed(ratio = 1) +
coord_fixed(ratio = 1, xlim = c(10, 27), ylim = c(10, 27)) +
xlab("Observed PercentSalaryHike") +
ylab("Predicted PercentSalaryHike") +
ggtitle("Observed vs Predicted PercentSalaryHike by PerformanceRating") +
scale_color_discrete(name = "PerformanceRating")
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Code 27
# Perform Shapiro-Wilk normality test
shapiro_test <- shapiro.test(data$PercentSalaryHike)
print(shapiro_test)
Shapiro-Wilk normality test
data: data$PercentSalaryHike
W = 0.90061, p-value < 2.2e-16
# Perform Welch Two Sample t-test
t_test <- t.test(PercentSalaryHike ~ OverTime, data=data)
print(t_test)
Welch Two Sample t-test
data: PercentSalaryHike by OverTime
t = 0.20508, df = 737.71, p-value = 0.8376
alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
95 percent confidence interval:
-0.3782954 0.4665489
sample estimates:
mean in group No mean in group Yes
15.22201 15.17788
Code 28
# Perform Linear regression
reg <- lm(PercentSalaryHike ~ PerformanceRating, data=data)
summary(reg)
Call:
lm(formula = PercentSalaryHike ~ PerformanceRating, data = data)
Residuals:
Min 1Q Median 3Q Max
-3.6923 -2.0146 -0.0146 1.2876 5.3333
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 16.0558 0.1810 88.73 < 2e-16 ***
PerformanceRating.L 4.8790 0.4487 10.88 < 2e-16 ***
PerformanceRating.Q 4.4303 0.3619 12.24 < 2e-16 ***
PerformanceRating.C 1.3670 0.2463 5.55 3.38e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.32 on 1466 degrees of freedom
Multiple R-squared: 0.5991, Adjusted R-squared: 0.5983
F-statistic: 730.2 on 3 and 1466 DF, p-value: < 2.2e-16
# Create a new dataframe with observed and predicted values
plot_data <- data.frame(Observed = data$PercentSalaryHike, Predicted = reg$fitted.values, PerformanceRating = data$PerformanceRating)
# Create the scatterplot
ggplot(plot_data, aes(x = Observed, y = Predicted, color = PerformanceRating)) +
geom_point(alpha = 0.7) +
geom_abline(intercept = 0, slope = 1) +
coord_fixed(ratio = 1) +
coord_fixed(ratio = 1, xlim = c(10, 27), ylim = c(10, 27)) +
xlab("Observed PercentSalaryHike") +
ylab("Predicted PercentSalaryHike") +
ggtitle("Observed vs Predicted PercentSalaryHike by PerformanceRating") +
scale_color_discrete(name = "PerformanceRating")
Coordinate system already present. Adding new coordinate system, which will replace the existing one.